ApoE4 dose effects on serum metabolome in Alzheimer’s Disease

a Data Science approach

George Miliarakis, MSc Thesis

18-03-2024

Data Exploration

The (interactive) correlation heatmap reveals very high correlation among aminoacids and TG compounds.

load("data/data.Rdata")
X <- df[df$Diagnosis == "Probable AD", 1:230]
C <- round(cor(X), 2)
heatmaply_cor(C, color = "magma", plot_method = "plotly", dendrogram = F, reorderfun = sort.default(d, w), main = "Correlation Heatmap", file = "heatmap.html", colorbar_thickness = 15, colorbar_len = 0.5)

Differential expression of metabolites per ApoE genotype

Global test in AD

Performing a Global Test of ApoE4 dose-effects, on serum metabolites of AD patients, correcting for sex (Ho: ApoE4 dose has no effect on mean metabolite levels, Ha: it has an effect), yields a significant difference (p = 0.0171). The most significantly affected metabolites are triglycerides and diglycerides (FDR-adjusted p-value <0.05).

AD <- subset(df, Diagnosis == "Probable AD")
AD$E4dose <- as.factor(AD$E4dose)

gt <- globaltest::gt(E4dose ~ 1, E4dose ~ . - E4 - Diagnosis - target, data = AD, standardize = TRUE)
gt
  p-value Statistic Expected Std.dev #Cov
1  0.0171      1.23      0.8   0.168  232
covariates <- covariates(gt, sort = T)

res.gt <- extract(covariates) %>%
  sort() %>%
  p.adjust(, method = "FDR") %>%
  globaltest::result() %>%
  mutate_if(is.numeric, round, digits = 3)
res.gt$direction <- as.factor(res.gt$direction)
levels(res.gt$direction) <- c("no ApoE4", "1 ApoE4", "2 ApoE4")
res.gt$alias <- NULL
res.gt <- res.gt[, 1:4]
names(res.gt) <- c("Inheritance", "Assoc. with", "FDR", "p-value")
res.gt <- res.gt[, c(1:(ncol(res.gt) - 2), ncol(res.gt), (ncol(res.gt) - 1))]
filter(res.gt, `p-value` < 0.05)
                              Inheritance Assoc. with p-value   FDR
Lip.TG.56.2.                        0.046     1 ApoE4   0.000 0.016
Lip.TG.58.1.                        0.117     1 ApoE4   0.000 0.021
Lip.DG.36.3.                        0.190     1 ApoE4   0.000 0.021
Lip.TG.56.3.                        0.244     1 ApoE4   0.000 0.021
Lip.TG.52.3.                        0.424     1 ApoE4   0.001 0.031
Lip.TG.54.5.                        0.480     1 ApoE4   0.001 0.027
Lip.TG.58.2.                        0.722     1 ApoE4   0.001 0.027
Lip.TG.54.4.                        0.761     1 ApoE4   0.002 0.033
Lip.TG.58.9.                        1.000     1 ApoE4   0.001 0.027
Lip.TG.56.7.                        1.000     1 ApoE4   0.001 0.027
Lip.TG.58.8.                        1.000     1 ApoE4   0.001 0.032
Lip.TG.54.6.                        1.000     1 ApoE4   0.002 0.035
Lip.TG.56.8.                        1.000     1 ApoE4   0.002 0.037
Lip.TG.52.4.                        1.000     1 ApoE4   0.003 0.055
Lip.TG.54.3.                        1.000     1 ApoE4   0.004 0.055
Lip.TG.56.6.                        1.000     1 ApoE4   0.004 0.059
Lip.TG.56.1.                        1.000     1 ApoE4   0.004 0.059
Lip.TG.52.2.                        1.000     1 ApoE4   0.005 0.063
Lip.TG.54.2.                        1.000     1 ApoE4   0.005 0.063
Lip.TG.60.2.                        1.000     1 ApoE4   0.007 0.077
Lip.TG.58.10.                       1.000     1 ApoE4   0.011 0.124
Lip.TG.51.3.                        1.000     1 ApoE4   0.015 0.154
Lip.SM.d18.1.18.1.                  1.000     2 ApoE4   0.018 0.169
OA.OA09...2.ketoglutaric.acid       1.000     2 ApoE4   0.019 0.169
Lip.TG.52.0.                        1.000     1 ApoE4   0.019 0.169
Lip.TG.50.3.                        1.000     2 ApoE4   0.020 0.169
OA.OA29...Uracil                    1.000    no ApoE4   0.020 0.169
Lip.TG.52.5.                        1.000     1 ApoE4   0.021 0.174
Lip.TG.54.0.                        1.000     1 ApoE4   0.022 0.174
Lip.TG.50.2.                        1.000     2 ApoE4   0.022 0.174
Lip.TG.51.2.                        1.000     1 ApoE4   0.023 0.174
Am.L.Glutamine                      1.000    no ApoE4   0.024 0.174
Lip.TG.50.1.                        1.000     2 ApoE4   0.025 0.174
Lip.TG.50.4.                        1.000     1 ApoE4   0.026 0.176
Lip.TG.52.1.                        1.000     1 ApoE4   0.027 0.176
Lip.TG.54.1.                        1.000     1 ApoE4   0.031 0.203
Lip.TG.50.0.                        1.000     1 ApoE4   0.037 0.229
OA.OA08...Malic.acid                1.000     2 ApoE4   0.038 0.234
Lip.DG.36.2.                        1.000     1 ApoE4   0.039 0.235
OS.HpH.PAF.C16.0                    1.000     2 ApoE4   0.049 0.283

Nested Linear Models

Metabolite-level models to test for ApoE4 dose effects, correcting for age at diagnosis, sex, smoking status, alcohol consumption, hypertension (and medication), hyperlipidemia (and medication), anticoagulant medication, anti-depressants, mean arterial pressure (MAP) and body mass index (BMI). The F-tests yielded p-values < 0.05, however none of them survived FDR correction. Interestingly, in the SCD group, aminoacids are mostly affected and no TGs or DGs compared to the AD group.

nested <- function(Y, x, ...) {
  ### Analysis of Variance (ANOVA)
  x <- as.factor(x)
  df <- cbind(Y, x, ...)
  ncol <- ncol(Y)
  covariates <- names(...)
  F_tests <- furrr::future_map(df[, 1:ncol], ~ {
    frm <- as.formula(paste0(".x ~", paste(covariates, collapse = "+")))
    rm <- lm(frm, data = df)
    ffm <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
    fm <- lm(ffm, data = df)
    anova(rm, fm)
  })
  # Correction for Multiple Testing
  # Create a list to store the p-values
  # Extract the p-values of the F-tests from the anova summaries list and store them in p_values
  p_values <- F_tests %>%
    future_map(~ .x[["Pr(>F)"]][[2]])
  # Coerce p_values to dataframe and transpose it
  p_values <- as.data.frame(p_values) %>%
    t() %>%
    data.frame(row.names = colnames(Y))
  # Calculate the FDR-adjusted p-values
  p_values$p_adj <- p.adjust(p_values[, ], method = "fdr", n = 230)
  p_values <- round(p_values, 3)
  # Filter out the non-significant (a=0.05) FDR-adjusted p-values
  sig <- as.data.frame(dplyr::filter(p_values, `.` < 0.05))
  sig <- sig[order(sig$p_adj), ]
  names(sig) <- c("P(>F)", "FDR")
  cat("Significant F-tests: \n")
  print(sig)
  cat("\nDose effects on metabolites:")
  summaries <- furrr::future_map(df[, rownames(sig)], ~ {
    f <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
    mdl <- lm(f, data = df)
    summary(mdl)$coefficients[1:length(table(x)), ]
  })
  summaries <- lapply(summaries, function(x) {
    x <- x[, -c(2, 3)]
    x <- t(x)
  })
  summaries <- plyr::ldply(summaries, id = 1:2) %>%
    mutate(across(where(is.numeric), round, digits = 3))
  coeffs <- summaries[c(TRUE, FALSE), ]
  pvalues <- summaries[c(FALSE, TRUE), ]
  t_tests <- cbind.data.frame(coeffs[, 1], coeffs[, 2], pvalues[, 2], coeffs[, 3], pvalues[, 3], coeffs[, 4], pvalues[, 4])
  names(t_tests) <- c("Metabolite", "No ApoE4", "P(>t)", "ApoE4x1", "P(>t)", "ApoE4x2", "P(>t)")
  return(t_tests)
}
Y <- df[, 1:230]
df$E4dose <- as.numeric(df$E4dose) - 1
df$E4dose[df$E4dose == 0] <- "No ApoE4"
df$E4dose[df$E4dose == 1] <- "1 ApoE4"
df$E4dose[df$E4dose == 2] <- "2 ApoE4"
E4dose <- df$E4dose <- as.factor(df$E4dose)

AD group

Among AD patients, ApoE4 dose seems to have positive effect on several triglycerides, diglycerides, putrescine, 2-ketoglutraric acid, lysophosphatidylcholin, HpH.PAF.-C16.0 and -C18.0

load("data/clinical.Rdata")
AD <- subset(df, Diagnosis == "Probable AD")
ADY <- AD[, 1:230]
ADE4dose <- AD$E4dose
ADclinical <- clinical[df$Diagnosis == "Probable AD", ]

## Testing for effects of ApoE4 dose
nested(ADY, relevel(ADE4dose, ref = "No ApoE4"), ADclinical)
Significant F-tests: 
                              P(>F)   FDR
Lip.TG.52.3.                  0.001 0.156
Lip.TG.52.4.                  0.003 0.156
Lip.DG.36.3.                  0.002 0.156
OS.HpH.PAF.C16.0              0.002 0.156
Lip.TG.52.2.                  0.006 0.255
Lip.TG.54.5.                  0.010 0.373
Lip.TG.50.2.                  0.012 0.398
Lip.TG.50.1.                  0.018 0.450
Lip.TG.54.4.                  0.020 0.450
Lip.TG.54.6.                  0.017 0.450
Am.Putrescine                 0.029 0.515
OA.OA09...2.ketoglutaric.acid 0.026 0.515
Lip.TG.50.3.                  0.028 0.515
Lip.TG.52.5.                  0.042 0.538
Lip.TG.56.7.                  0.042 0.538
Lip.TG.56.8.                  0.044 0.538
Lip.TG.58.9.                  0.042 0.538
Lip.LPC.16.0.                 0.033 0.538
OS.HpH.LPA.C18.0              0.044 0.538

Dose effects on metabolites:
                      Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1                   Lip.TG.52.3.   -1.443 0.818   2.568 0.004   3.719 0.001
2                   Lip.TG.52.4.    0.620 0.888   1.628 0.008   2.392 0.002
3                   Lip.DG.36.3.    0.026 0.631   0.019 0.012   0.031 0.001
4               OS.HpH.PAF.C16.0   34.751 0.000   2.593 0.017   4.645 0.001
5                   Lip.TG.52.2.   -5.080 0.469   2.124 0.030   3.744 0.002
6                   Lip.TG.54.5.    1.863 0.496   0.928 0.016   1.280 0.006
7                   Lip.TG.50.2.   -4.499 0.297   0.882 0.141   2.193 0.003
8                   Lip.TG.50.1.   -2.324 0.539   0.706 0.179   1.837 0.005
9                   Lip.TG.54.4.    3.457 0.318   1.178 0.015   1.369 0.020
10                  Lip.TG.54.6.   -0.434 0.795   0.515 0.027   0.746 0.009
11                 Am.Putrescine    0.004 0.000   0.000 0.035   0.000 0.017
12 OA.OA09...2.ketoglutaric.acid    0.008 0.366   0.000 0.953   0.003 0.014
13                  Lip.TG.50.3.   -1.203 0.668   0.594 0.127   1.268 0.008
14                  Lip.TG.52.5.    0.459 0.789   0.358 0.134   0.725 0.014
15                  Lip.TG.56.7.   -2.394 0.095   0.487 0.015   0.392 0.105
16                  Lip.TG.56.8.   -1.393 0.055   0.248 0.014   0.175 0.151
17                  Lip.TG.58.9.   -0.447 0.068   0.085 0.013   0.034 0.410
18                 Lip.LPC.16.0.    4.220 0.028   0.312 0.237   0.850 0.009
19              OS.HpH.LPA.C18.0    0.498 0.038   0.044 0.178   0.101 0.013

SCD group

Among individuals with SCD, lipid metabolites were not affected as much as in the AD group, with only two sphingomyelin species showing a difference. Aminoacids L-serine, tryptophan, glycine,trytptophan, L-homoserine, putrescine were affected in this group.

SCD <- df[df$Diagnosis == "Subjectieve klachten", ]
SCDY <- SCD[, 1:230]
SCDE4dose <- SCD$E4dose
SCDclinical <- clinical[df$Diagnosis == "Subjectieve klachten", ]

## Testing for effects of ApoE4 dose
nested(SCDY, relevel(SCDE4dose, ref = "No ApoE4"), SCDclinical)
Significant F-tests: 
                   P(>F)   FDR
Am.L.Serine        0.002 0.285
Am.L.Tryptophan    0.002 0.285
Am.Glycine         0.006 0.450
Am.L.homoserine    0.047 0.931
Am.Putrescine      0.045 0.931
Lip.SM.d18.1.22.0. 0.043 0.931
Lip.SM.d18.1.23.0. 0.029 0.931

Dose effects on metabolites:
          Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1        Am.L.Serine    4.669 0.000   0.191 0.115  -1.058 0.003
2    Am.L.Tryptophan    3.747 0.004  -0.307 0.047  -1.364 0.002
3         Am.Glycine    4.332 0.001   0.382 0.015  -0.873 0.052
4    Am.L.homoserine    0.003 0.001   0.000 0.723  -0.001 0.014
5      Am.Putrescine    0.000 0.969   0.001 0.013   0.000 0.927
6 Lip.SM.d18.1.22.0.    3.337 0.002   0.309 0.015   0.272 0.448
7 Lip.SM.d18.1.23.0.    1.241 0.010   0.144 0.012   0.176 0.277

Multi-class classification of ApoE4 presence and AD

The discriminative potential of the metabolites, correcting for clinical background features, on AD and ApoE4 or not (4-class response ADE4, AD, SCDE4, SCD) is assessed by first fitting a Multi-nomial Logistic Regression (benchmark model). The benchmark model fits only the clinical features. The model’s AUC is compared with the same model (with altered hyperparameters) fitting clinical variables + 230 metabolites. The metabolites are then projected to 6 latent ML-estimated factors and 3 more models are fitted: Multi-nomial Logistic Regression, Decision Tree and XGBoost fitting clinical variables + 6 factors. The models’ multi-class classification performance is compared using confusion matrices and AUCs.

multifit <- function(X,
                     y,
                     model,
                     ctrl = NULL,
                     grid = NULL,
                     seed = 87654, ...) {
  set.seed(seed)
  # Merge X and y into df
  df <- cbind.data.frame(X, y)
  # Train the model
  model <- caret::train(df[, 1:ncol(X)], df$y,
    method = model,
    tuneGrid = grid,
    trControl = ctrl,
    metric = "logLoss",
    # importance=TRUE,
    ...
  )
  obs <- model$pred$obs
  preds <- model$pred$pred

  # Get the multi-class ROC curves
  ys <- as.numeric(obs) - 1
  yhats <- as.numeric(preds) - 1
  roc <- multiclass.roc(
    response = ys,
    predictor = yhats, quiet = T, legacy.axes = T
  )
  print(roc$auc)
  names <- c(
    "ADE4 vs. AD", "ADE4 vs. SCDE4",
    "ADE4 vs. SCD", "AD vs SCDE4", "AD vs. SCD", "SCDE4 vs. SCD"
  )
  for (i in 1:length(names)) {
    names[[i]] <- paste0(
      names[[i]], ", AUC=",
      paste(round(auc(roc$rocs[[i]]), 2))
    )
  }
  names(roc$rocs) <- names
  cat("\n")
  # Create a confusion matrix and get performance metrics from caret
  cm <- confusionMatrix(reference = obs, data = preds, mode = "everything")
  print(cm)
  out <- list("model"= model, "roc"= roc)
  return(out)
}
load("data/data.Rdata")
X <- scale(df[, 1:230])
y <- df$target

Xclin <- cbind.data.frame(X, clin_dummy)


ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  savePredictions = "final",
  classProbs = T,
  summaryFunction = multiClassSummary,
  selectionFunction = best,
  search = "random",
  sampling = "smote"
)

Fitting a benchmark model: clinical characteristics only

benchmark <- multifit(
  X = clin_dummy,
  y = y,
  model = "multinom",
  ctrl = ctrl,
  grid = expand.grid(decay = 0),
  trace = F
)
Multi-class area under the curve: 0.8078

Confusion Matrix and Statistics

          Reference
Prediction AD ADE4 SCD SCDE4
     AD    47   22   0     2
     ADE4  37    9   0     2
     SCD    0    2  20    43
     SCDE4  2    1  20    40

Overall Statistics
                                          
               Accuracy : 0.4696          
                 95% CI : (0.4061, 0.5339)
    No Information Rate : 0.3522          
    P-Value [Acc > NIR] : 9.621e-05       
                                          
                  Kappa : 0.284           
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity             0.5465     0.26471    0.50000       0.4598
Specificity             0.8509     0.81690    0.78261       0.8562
Pos Pred Value          0.6620     0.18750    0.30769       0.6349
Neg Pred Value          0.7784     0.87437    0.89011       0.7446
Precision               0.6620     0.18750    0.30769       0.6349
Recall                  0.5465     0.26471    0.50000       0.4598
F1                      0.5987     0.21951    0.38095       0.5333
Prevalence              0.3482     0.13765    0.16194       0.3522
Detection Rate          0.1903     0.03644    0.08097       0.1619
Detection Prevalence    0.2874     0.19433    0.26316       0.2551
Balanced Accuracy       0.6987     0.54080    0.64130       0.6580
g <- ggroc(benchmark$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)

Regularized Multinomial Regression adding 230 metabolites

mlr <- multifit(
  X = Xclin,
  y = y,
  model = "multinom",
  ctrl = ctrl,
  grid = expand.grid(decay = 9.187724),
  trace = F,
  importance=T
)
Multi-class area under the curve: 0.8323

Confusion Matrix and Statistics

          Reference
Prediction AD ADE4 SCD SCDE4
     AD    61   24   0     0
     ADE4  25    9   0     1
     SCD    0    0  17    30
     SCDE4  0    1  23    56

Overall Statistics
                                          
               Accuracy : 0.5789          
                 95% CI : (0.5147, 0.6413)
    No Information Rate : 0.3522          
    P-Value [Acc > NIR] : 3.308e-13       
                                          
                  Kappa : 0.4118          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity             0.7093     0.26471    0.42500       0.6437
Specificity             0.8509     0.87793    0.85507       0.8500
Pos Pred Value          0.7176     0.25714    0.36170       0.7000
Neg Pred Value          0.8457     0.88208    0.88500       0.8144
Precision               0.7176     0.25714    0.36170       0.7000
Recall                  0.7093     0.26471    0.42500       0.6437
F1                      0.7135     0.26087    0.39080       0.6707
Prevalence              0.3482     0.13765    0.16194       0.3522
Detection Rate          0.2470     0.03644    0.06883       0.2267
Detection Prevalence    0.3441     0.14170    0.19028       0.3239
Balanced Accuracy       0.7801     0.57132    0.64004       0.7468
g <- ggroc(mlr$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# Get the feature importance scores
fimp <- varImp(mlr$model)

fimp$importance %>%
  arrange(desc(Overall)) %>%
  top_n(20)
                                  Overall
Am.Putrescine                   100.00000
OS.HpH.Spha.1.P.C18.0            93.04344
OA.OA29...Uracil                 91.82792
Lip.PE.O.38.5.                   90.87753
OA.OA17...3.Hydroxybutyric.acid  90.68479
Am.Sarcosine                     86.83056
med_chol_nee                     86.72834
Lip.TG.56.0.                     85.98387
OS.HpH.LPA.C20.5                 77.36226
Am.L.Histidine                   75.27213
Am.Histamine                     74.82199
Lip.PC.38.3.                     74.12450
Am.Glutathione                   72.74617
Lip.SM.d18.1.20.1.               72.52063
Am.Serotonine                    70.53271
OA.OA13...Pyruvic.acid           70.27859
Am.L.Glutamine                   69.94468
OS.HpH.LPA.C18.0                 68.72490
OS.cHpH.aLPA.C16.1               66.13243
Am.L.Valine                      65.82711

Projection to Latent Factors

project <- function(X, m, seed) {
  set.seed(seed)
  X <- scale(as.matrix(df[, 1:230]))
  cov <- cor(X)

  # Find redundant features
  filter <- RF(cov)

  # Filter out redundant features
  filtered <- subSet(X, filter)

  # Regularized correlation matrix estimation
  M <- regcor(filtered)

  # Get the regularized correlation matrix of the filtered dataset
  R <- M$optCor

  mlfa <- mlFA(R, m = 6)
  thomson <- facScore(filtered, mlfa$Loadings, mlfa$Uniqueness)
  return(thomson)
}
thomson <- project(X, 6, seed = 1234)
X <- cbind(clin_dummy, thomson)

Multinomial Logistic Regression adding 6 factors

mlrf <- multifit(
  X = X,
  y = y,
  model = "multinom",
  ctrl = ctrl,
  trace = FALSE,
  grid = expand.grid(decay = 0)
)
Multi-class area under the curve: 0.8096

Confusion Matrix and Statistics

          Reference
Prediction AD ADE4 SCD SCDE4
     AD    51   22   0     1
     ADE4  32    8   0     2
     SCD    1    2  20    32
     SCDE4  2    2  20    52

Overall Statistics
                                          
               Accuracy : 0.5304          
                 95% CI : (0.4661, 0.5939)
    No Information Rate : 0.3522          
    P-Value [Acc > NIR] : 7.906e-09       
                                          
                  Kappa : 0.3548          
                                          
 Mcnemar's Test P-Value : 0.2415          

Statistics by Class:

                     Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity             0.5930     0.23529    0.50000       0.5977
Specificity             0.8571     0.84038    0.83092       0.8500
Pos Pred Value          0.6892     0.19048    0.36364       0.6842
Neg Pred Value          0.7977     0.87317    0.89583       0.7953
Precision               0.6892     0.19048    0.36364       0.6842
Recall                  0.5930     0.23529    0.50000       0.5977
F1                      0.6375     0.21053    0.42105       0.6380
Prevalence              0.3482     0.13765    0.16194       0.3522
Detection Rate          0.2065     0.03239    0.08097       0.2105
Detection Prevalence    0.2996     0.17004    0.22267       0.3077
Balanced Accuracy       0.7251     0.53783    0.66546       0.7239
g <- ggroc(mlrf$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)

Decision Tree

tree <- multifit(
  X = X,
  y = y,
  model = "rpart2",
  ctrl = ctrl,
  grid = expand.grid(maxdepth = 3)
)
Multi-class area under the curve: 0.8076

Confusion Matrix and Statistics

          Reference
Prediction AD ADE4 SCD SCDE4
     AD    55   17   0     1
     ADE4  27   13   1     7
     SCD    2    3  33    59
     SCDE4  2    1   6    20

Overall Statistics
                                        
               Accuracy : 0.4899        
                 95% CI : (0.426, 0.554)
    No Information Rate : 0.3522        
    P-Value [Acc > NIR] : 6.171e-06     
                                        
                  Kappa : 0.3335        
                                        
 Mcnemar's Test P-Value : 1.011e-09     

Statistics by Class:

                     Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity             0.6395     0.38235     0.8250      0.22989
Specificity             0.8882     0.83568     0.6908      0.94375
Pos Pred Value          0.7534     0.27083     0.3402      0.68966
Neg Pred Value          0.8218     0.89447     0.9533      0.69266
Precision               0.7534     0.27083     0.3402      0.68966
Recall                  0.6395     0.38235     0.8250      0.22989
F1                      0.6918     0.31707     0.4818      0.34483
Prevalence              0.3482     0.13765     0.1619      0.35223
Detection Rate          0.2227     0.05263     0.1336      0.08097
Detection Prevalence    0.2955     0.19433     0.3927      0.11741
Balanced Accuracy       0.7639     0.60902     0.7579      0.58682
g <- ggroc(tree$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)

XGBoost Forest

xgb <- multifit(
  X = X,
  y = y,
  model = "xgbTree",
  ctrl = ctrl,
  grid = xgb.grid
)
Multi-class area under the curve: 0.8383

Confusion Matrix and Statistics

          Reference
Prediction AD ADE4 SCD SCDE4
     AD    67   21   0     0
     ADE4  18   11   1     0
     SCD    1    0  19    37
     SCDE4  0    2  20    50

Overall Statistics
                                          
               Accuracy : 0.5951          
                 95% CI : (0.5311, 0.6569)
    No Information Rate : 0.3522          
    P-Value [Acc > NIR] : 6.845e-15       
                                          
                  Kappa : 0.4371          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity             0.7791     0.32353    0.47500       0.5747
Specificity             0.8696     0.91080    0.81643       0.8625
Pos Pred Value          0.7614     0.36667    0.33333       0.6944
Neg Pred Value          0.8805     0.89401    0.88947       0.7886
Precision               0.7614     0.36667    0.33333       0.6944
Recall                  0.7791     0.32353    0.47500       0.5747
F1                      0.7701     0.34375    0.39175       0.6289
Prevalence              0.3482     0.13765    0.16194       0.3522
Detection Rate          0.2713     0.04453    0.07692       0.2024
Detection Prevalence    0.3563     0.12146    0.23077       0.2915
Balanced Accuracy       0.8243     0.61716    0.64571       0.7186
g <- ggroc(xgb$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# cvms::plot_confusion_matrix(cmxgb, palette = "Oranges" , theme_fn = ggthemes::theme_tufte)

Model Comparison

aucs <- data.frame(AUC = c("Clinical features only" = benchmark$roc$auc,
"Clinical features + 230 metabolites" = mlr$roc$auc,
"Clinical features + 6 latent factors" = mlrf$roc$auc,
"Decision Tree" = tree$roc$auc, "XGBoost" = xgb$roc$auc))

aucs
                                           AUC
Clinical features only               0.8078106
Clinical features + 230 metabolites  0.8322980
Clinical features + 6 latent factors 0.8095760
Decision Tree                        0.8075950
XGBoost                              0.8382963

Observations:

  1. Adding serum metabolite information (either the full 230-metabolite matrix or its 6-factor projection) seems to increase the discriminatory power of the models.

  2. Fitting 6 ML-estimated factors obtained by the FMradio package (cummulatively explaining 30% of variace) yields increased classification performance, serving as a valuable dimension reduction technique for high-dimensional data.

  3. Looking at the confusion matrix and individual ROC curves, all models were able to discriminate better among certain classes (AD+E4/SCD+E4, AD+E4/SCD, AD-E4/SCD+E4 and AD-E4/SCD-E4) compared to others (AD+E4/AD-E4 and SCD+E4/SCD-E4).

Metabolite Covariance Network Analysis in AD

The covariance network analysis shows distinct metabolic “images” among ApoE4 carriers and non-carriers in AD. The network vertices and edges are distinct.

# Store all observations of ApoE4 non-carriers in C1
C1 <- scale(AD[AD$E4 == 0, 1:230])
# Store all observations of ApoE4 carriers in C2
C2 <- scale(AD[AD$E4 == 1, 1:230])

# Get the covariance matrices of C1 and C2
S1 <- covML(C1)
S2 <- covML(C2)

# Store them in a list
S <- list(S1 = S1, S2 = S2)

# Get the total number of samples
n <- c(nrow(S1), nrow(S2))

# Create a list of fused covariance matrices T
Ts <- default.target.fused(Slist = S, ns = n, type = "DUPV")
# Get the optimal lambdas per class and fused with 10-fold CV
# set.seed(8910)
# optf <- optPenalty.fused(
#   Ylist = Ys,
#   Tlist = Ts,
#   lambda = default.penalty(Ys),
#   cv.method = "kCV",
#   k = 10,
#   verbose = FALSE
# )
# save(optf, file= "data/optf.Rdata")
i = 1   | max diff = 8.0963463215e-01
i = 2   | max diff = 7.3872853063e-29
Converged in 2 iterations, max diff < 1.49e-08.
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting


- Retained elements:  55 
- Corresponding to 0.21 % of possible edges 
 
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting


- Retained elements:  205 
- Corresponding to 0.78 % of possible edges 
 

GGMs showing distinct metabolic compositions among ApoE4 carriers (left), non-carriers(middle) and the differential edges (right).

# Merge the sparse high precision matrices
TST <- Union(P0s$S1$sparseParCor, P0s$S2$sparseParCor)
PCE4NO <- TST$M1subset
PCE4YES <- TST$M2subset

# Create a color map per metabolite class
Colors <- rownames(PCE4YES)
Colors[grep("Am", rownames(PCE4YES))] <- "pink"
Colors[grep("OA", rownames(PCE4YES))] <- "lightblue"
Colors[grep("Lip", rownames(PCE4YES))] <- "yellow"
Colors[grep("OS", rownames(PCE4YES))] <- "grey"
# httpgd::hgd( width = getOption("httpgd.width", 700),
#   height = getOption("httpgd.height", 700),)
set.seed(111213)
# Plot he sparsified ridge matrix of ApoE4 carriers with AD
Coords <- Ugraph(PCE4YES,
  type = "fancy", lay = "layout_with_fr",
  Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
  main = "ApoE4 carriers with AD"
)

# plot(NULL, xaxt = "n", yaxt = "n", bty = "n", ylab = "", xlab = "", xlim = 0:1, ylim = 0:1)
# legend("topleft",
#   legend = c(
#     "Amines, aminoacids", "Organic Acids", "Lipids",
#     "Oxidative stress compounds"
#   ), pch = 16, pt.cex = 3, cex = 1.5, bty = "n",
#   col = c("pink", "lightblue", "yellow", "grey")
# )


# names <- c("ApoE4 non-carrier edges", "ApoE4 carrier edges", "positive partial cor.", "negative partial cor.")
# clrs <- c("red", "green", "black", "black")
# ltype <- c(1, 1, 1, 2)
# plot(NULL, xaxt = "n", yaxt = "n", bty = "n", ylab = "", xlab = "", xlim = 0:1, ylim = 0:1)
# legend("topleft",
#   title = "", legend = names, lty = ltype, lwd = 2, cex = 1.5,
#   bty = "n", col = clrs
# )
# Plot the sparsified ridge matrix of ApoE4 non-carriers with AD
Ugraph(PCE4NO,
  type = "fancy", lay = NULL, coords = Coords,
  Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
  main = "ApoE4 non-carriers with AD"
)

# Plot the differential network
diffgraph <- DiffGraph(PCE4NO, PCE4YES,
  lay = NULL, coords = Coords,
  Vcolor = Colors, Vsize = 5, Vcex = 0.3, main = "Differential Network"
)

Distinct metabolite communities among ApoE4 carriers/non carriers

# Get the communities per class
set.seed(141516)
CommC1 <- Communities(PCE4NO,
  Vcolor = Colors, Vsize = 5, Vcex = 0.3,
  main = "ApoE4 non-carriers with AD"
)

CommC2 <- Communities(PCE4YES,
  Vcolor = Colors, Vsize = 5, Vcex = 0.3,
  main = "ApoE4 carriers with AD"
)

PC0list <- list(PCE4NO = PCE4NO, PCE4YES = PCE4YES)
# Get the network statistics
NetStats <- GGMnetworkStats.fused(PC0list)

NetStatsE4yes <- NetStats[, 10:18]
NetStatsE4no <- NetStats[, 1:9]

NetStatsE4yes[, c(3, 9)] <- NetStatsE4no[, c(3, 9)] <- NULL
DegreesE4yes <- as.data.frame(NetStatsE4yes[, 1], rownames(NetStats))
DegreesE4no <- as.data.frame(NetStatsE4no[, 1], row.names = row.names(NetStatsE4no))
head(DegreesE4yes[order(DegreesE4yes[, 1], decreasing = TRUE), , drop = FALSE], 20)
                                   NetStatsE4yes[, 1]
Am.Dopamine                                         7
Am.ADMA                                             5
Am.Citrulline                                       5
Am.L.Kynurenine                                     5
OA.OA27...3.hydroxyisovaleric.acid                  5
OS.HpH.PAF.C16.0                                    5
Am.X1.Methylhistidine                               4
Am.X3.Methoxytyramine                               4
Am.DL.3.aminoisobutyric.acid                        4
Am.Ethanolamine                                     4
Am.L.2.aminoadipic.acid                             4
Am.L.4.hydroxy.proline                              4
Am.SDMA                                             4
OA.OA06...Glycolic.acid                             4
OA.OA09...2.ketoglutaric.acid                       4
OA.OA12...Fumaric.acid                              4
OA.OA17...3.Hydroxybutyric.acid                     4
OA.OA20...Aspartic.acid                             4
OA.OA28...Glyceric.acid                             4
Lip.TG.56.8.                                        4
head(DegreesE4no[order(DegreesE4no[, 1], decreasing = TRUE), , drop = FALSE], 20)
                         NetStatsE4no[, 1]
Am.X3.Methoxytyrosine                    5
OS.LpH.NO2.OA                            4
Am.Cysteine                              3
Am.Dopamine                              3
Am.Glutathione                           3
Am.L.Aspartic.acid                       3
Am.X3.Methylhistidine                    2
Am.Hydroxylysine                         2
Am.L.4.hydroxy.proline                   2
Am.L.carnosine                           2
Am.Methyldopa                            2
OA.OA03...Citric.acid                    2
OA.OA13...Pyruvic.acid                   2
OA.OA20...Aspartic.acid                  2
OA.OA23...Iminodiacetate                 2
OA.OA29...Uracil                         2
Lip.TG.60.2.                             2
Lip.SM.d18.1.24.0.                       2
OS.LpH.NO2.LA                            2
OS.LpH.NO2.aLA                           2

Network statistics

Wilcoxon Signed Rank test between the network statistics of ApoE4 carriers and non-carriers. The test shows distinct centrality degrees and average number of negative and positive edges.

# Perform a Wilcoxon signed rank test
w <- furrr::future_map2(NetStatsE4yes[, 1:7], NetStatsE4no[, 1:7], ~ {
  wilcox.test(.x, .y, paired = TRUE, alternative = "greater")
})
p.values <- names <- list(1:length(w))
for (i in 1:length(w)) {
  names[[i]] <- names(w[[i]])[7:length(names(w[[i]]))]
  p.values[[i]] <- w[[i]][["p.value"]]
}
names(p.values) <- names(w)
d <- as.data.frame(p.values)
d <- t(d)
d <- cbind.data.frame(d, p.adjust(d, method = "fdr"))
rownames(d) <- sub("PCE4YES.", "", row.names(d))
names(d) <- c("p-value", "FDR")
d
                     p-value          FDR
degree          7.905289e-27 1.844567e-26
betweenness     5.439808e-18 6.346442e-18
eigenCentrality 3.626367e-25 6.346142e-25
nNeg            8.837198e-06 8.837198e-06
nPos            1.436859e-24 2.011603e-24
mutualInfo      1.773939e-30 6.411006e-30
variance        1.831716e-30 6.411006e-30